
#Asma Binte Afzal(Main Author and Contributor), Graduate Research Assistant,School of Public Health, IU Bloomington
#Dear users, please reach out if you need any help with the codes or clarification, thank you for your interest.

#final data save
save(swan_lng_clean, file = "swan_lng_clean.RData")

load("swan_lng_clean.RData")

summary(swan_lng_clean)

# DATA CLEANING
# Convert ordinal variables to ordered factors

swan_lng_clean <- swan_lng_clean %>%
  mutate(
    FEELBLU = factor(FEELBLU, 
                     levels = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"), 
                     ordered = TRUE),
    DEPRESS = factor(DEPRESS, 
                     levels = c("Rarely/none of the time", "Some/a little of the time", 
                                "Occasionally/moderate amount of the time", "Most/all of the time"), 
                     ordered = TRUE),
    RESTLES = factor(RESTLES, 
                     levels = c("Rarely/none of the time", "Some/a little of the time", 
                                "Occasionally/moderate amount of the time", "Most/all of the time"), 
                     ordered = TRUE),
    SLEEPQL = factor(SLEEPQL, 
                     levels = c("Very good", "Fairly good", "Fairly bad", "Very bad"), 
                     ordered = TRUE),
    GLASLIQ = factor(GLASLIQ, 
                     levels = c("None or less than 1/month", "1-3/month", "1/week", 
                                "2-4/week", "5-6/week", "1/day", "2-3/day", "4/day", 
                                "Equal to or more than 5/day"), 
                     ordered = TRUE)
  )


# Convert character variables to factors
swan_lng_clean <- swan_lng_clean %>%
  mutate(
    GLASLIQ = factor(GLASLIQ, 
                     levels = c("None or less than 1/month", "1-3/month", "1/week", 
                                "2-4/week", "5-6/week", "1/day", "2-3/day", 
                                "4/day", "Equal to or more than 5/day"), 
                     ordered = TRUE),  # Convert to ordered factor
    ALCO24H = factor(ALCO24H, levels = c("No", "Yes")),
    AGE_Group = factor(AGE_Group, levels = c("1", "2", "3"))  # Ensure AGE_Group is categorical
  )

# Verify conversion
summary(swan_lng_clean)



str(swan_lng_clean)
summary(swan_lng_clean)

table(swan_lng_clean$SMOKERE)
table(swan_lng_clean$ALCO24H)

# Repeat for all variables in the formula
# Count the levels for each variable in the dataset
cat("SMOKERE:\n")
print(table(swan_lng_clean$SMOKERE))

cat("\nALCO24H:\n")
print(table(swan_lng_clean$ALCO24H))

cat("\nALCHL24R:\n")
print(table(swan_lng_clean$ALCHL24R))

cat("\nDRNKBEE:\n")
print(table(swan_lng_clean$DRNKBEE))

cat("\nSLEEP1M:\n")
print(table(swan_lng_clean$SLEEP1M))

cat("\nRESTLES:\n")
print(table(swan_lng_clean$RESTLES))

cat("\nSLEEPQL:\n")
print(table(swan_lng_clean$SLEEPQL))

cat("\nDEPRESS:\n")
print(table(swan_lng_clean$DEPRESS))

cat("\nNRVOUS:\n")
print(table(swan_lng_clean$NRVOUS))

cat("\nAVCIGDA:\n")
print(table(swan_lng_clean$AVCIGDA)) # For numeric variables, you might prefer summary()

cat("\nAGE_Group:\n")
print(table(swan_lng_clean$AGE_Group)) # For numeric variables, you might prefer summary()

cat("\nRACE:\n")
print(table(swan_lng_clean$RACE))

cat("\nGLASLIQ:\n")
print(table(swan_lng_clean$GLASLIQ))


# Verify numeric variables
swan_lng_clean <- swan_lng_clean %>%
  mutate(
    AVCIGDA = as.numeric(AVCIGDA)
  )
cat("\nAVCIGDA:\n")
print(table(swan_lng_clean$AVCIGDA))

summary(swan_lng_clean$AVCIGDA)
hist(swan_lng_clean$AVCIGDA, breaks = 20)


swan_lng_clean$GLASLIQ <- droplevels(swan_lng_clean$GLASLIQ)

swan_lng_clean$DRNKBEE <- factor(swan_lng_clean$DRNKBEE)

swan_lng_clean$AGE_Group <- cut(
  swan_lng_clean$AGE,
  breaks = c(44, 50, 60, 70),
  labels = c("1", "2", "3")
)

print(table(swan_lng_clean$AVCIGDA ))

# Categorize AVCIGDA into groups
swan_lng_clean$AVCIGDA_Group <- cut(
  swan_lng_clean$AVCIGDA,
  breaks = c(-Inf, 0, 5, 10, 20, Inf),
  labels = c("None", "1-5", "6-10", "11-20", "20+"),
  right = FALSE
)



# Cross-tabulation of AVCIGDA_Group (categories) with RACE
table(swan_lng_clean$AVCIGDA_Group, swan_lng_clean$RACE)

# Three-way cross-tabulation
tabyl(swan_lng_clean, AVCIGDA_Group, RACE, AGE_Group)

ggplot(swan_lng_clean, aes(x = AVCIGDA_Group, fill = RACE)) +
  geom_bar(position = "fill") +
  facet_wrap(~ AGE_Group) +
  labs(title = "Proportion of Cigarette Consumption by Race and Age Group",
       x = "Cigarettes/Day (Grouped)", y = "Proportion") +
  scale_y_continuous(labels = scales::percent)




##ANALYSIS

#Primary Hypothesis:

# Model for Primary Hypothesis


simpler_model <- clm(
  FEELBLU ~ SMOKERE + ALCO24H + ALCHL24R + DRNKBEE + SLEEP1M + RESTLES + SLEEPQL + DEPRESS + NRVOUS + RESTLES + AVCIGDA_Group + AGE_Group + RACE,
  data = swan_lng_clean,
  control = clm.control(maxIter = 1e4, gradTol = 1e-6)
)

# Model Summary
summary(simpler_model)

model <- clmm(
  FEELBLU ~ SMOKERE + ALCO24H + ALCHL24R + DRNKBEE + SLEEP1M + RESTLES + SLEEPQL + 
    DEPRESS + NRVOUS + RESTLES + AVCIGDA_Group + AGE_Group + RACE + visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)
# Model Summary
summary(model)

#data visualization SLEEPQL vs. FEELBLU over time (visit)
#Expand grid for SLEEPQL and visit and visit
# Expand grid for predictors
# Create new data grid
new_data <- expand.grid(
  SLEEPQL = factor(c("Very good", "Fairly good", "Fairly bad", "Very bad"),
                   levels = c("Very good", "Fairly good", "Fairly bad", "Very bad")),
  NRVOUS = factor(c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"),
                  levels = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day")),
  visit = seq(min(swan_lng_clean$visit), max(swan_lng_clean$visit), by = 1),
  SMOKERE = "No",
  ALCO24H = "No",
  ALCHL24R = "No",
  DRNKBEE = "No",
  SLEEP1M = "No",
  RESTLES = "Rarely/none of the time",
  AVCIGDA_Group = "None",
  AGE_Group = "1",
  RACE = "Caucasian/White Non-Hispanic"
)


# Predict probabilities for FEELBLU categories
# Extract thresholds (Theta values)
thresholds <- model$Theta

# Extract model coefficients
# Extract all coefficients from the model
coefs <- coef(model)

# Create a complete coefficient vector with zeros for missing terms
matching_coefs <- rep(0, ncol(X))
names(matching_coefs) <- colnames(X)
matching_coefs[names(coefs)] <- coefs

# Verify that there are no NA values
print(matching_coefs)

length(matching_coefs)
print(names(matching_coefs))
print(colnames(X))

matching_coefs <- matching_coefs[colnames(X)]
missing_cols <- setdiff(colnames(X), names(matching_coefs))
matching_coefs[missing_cols] <- 0
matching_coefs <- matching_coefs[colnames(X)]  # Reorder to match X

linear_predictors <- X %*% matching_coefs
print(head(linear_predictors))  # Verify the output


# Add linear predictors to new_data for plotting
new_data$linear_predictors <- as.numeric(linear_predictors)

# Plot linear predictors by visit for each SLEEPQL and NRVOUS level
library(ggplot2)
ggplot(new_data, aes(x = visit, y = linear_predictors, color = SLEEPQL, group = SLEEPQL)) +
  geom_line() +
  facet_wrap(~ NRVOUS, scales = "free_y", ncol = 1) +
  labs(
    title = "Effect of SLEEPQL and NRVOUS on FEELBLU Over Visits",
    x = "Visit",
    y = "Linear Predictor",
    color = "SLEEPQL"
  ) +
  theme_minimal()



####
# Expand grid for SLEEPQL and visit
new_data <- expand.grid(
  SLEEPQL = c("Very good", "Fairly good", "Fairly bad", "Very bad"),
  visit = seq(min(swan_lng_clean$visit), max(swan_lng_clean$visit), by = 1),
  DEPRESS = mean(as.numeric(swan_lng_clean$DEPRESS)),
  SMOKERE = "No",
  ALCO24H = "No",
  ALCHL24R = "No",
  DRNKBEE = "No",
  SLEEP1M = "No",
  RESTLES = "Rarely/none of the time",
  AVCIGDA_Group = "None",
  AGE_Group = "1",
  RACE = "Caucasian/White Non-Hispanic"
)

# Predict probabilities
predicted_probs <- predict(model, new_data, type = "response")

# Reshape and plot
library(reshape2)
library(ggplot2)

plot_data <- cbind(new_data, predicted_probs)
plot_data_long <- melt(
  plot_data,
  id.vars = c("SLEEPQL", "visit"),
  variable.name = "FEELBLU",
  value.name = "Probability"
)

ggplot(plot_data_long, aes(x = visit, y = Probability, color = FEELBLU, group = FEELBLU)) +
  geom_line() +
  facet_wrap(~SLEEPQL, nrow = 2) +
  labs(
    title = "Effect of SLEEPQL on FEELBLU Across Visits",
    x = "Visit",
    y = "Predicted Probability",
    color = "FEELBLU Categories"
  ) +
  theme_minimal()



# Code to Refine the Plot and Calculate Predicted Probabilities

# Extract thresholds (Theta) from the model

thresholds <- coef(model)[grep("\\|", names(coef(model)))]  # Extract thresholds only

# Define the logistic function

logistic <- function(x) 1 / (1 + exp(-x))

# Create a dataset for prediction
new_data <- expand.grid(
  SLEEPQL = c("Very good", "Fairly good", "Fairly bad", "Very bad"),
  NRVOUS = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"),
  visit = seq(min(swan_lng_clean$visit), max(swan_lng_clean$visit), by = 1)
)

# Model matrix for linear predictors
X <- model.matrix(~ SLEEPQL + NRVOUS + visit, data = new_data, contrasts.arg = attr(model.frame(model), "contrasts"))

# Match coefficients with model matrix
coefs <- coef(model)
matching_coefs <- rep(0, ncol(X))
names(matching_coefs) <- colnames(X)
matching_coefs[names(coefs)] <- coefs
matching_coefs <- matching_coefs[colnames(X)]  # Reorder to match X

# Calculate linear predictors
linear_predictors <- X %*% matching_coefs

# Calculate cumulative probabilities
cumulative_probs <- sapply(thresholds, function(threshold) {
  logistic(threshold - linear_predictors)
})

# Calculate category probabilities
# Ensure cumulative_probs is a matrix
cumulative_probs <- matrix(
  sapply(thresholds, function(threshold) logistic(threshold - linear_predictors)),
  ncol = length(thresholds),
  byrow = FALSE
)

# Check dimensions
print(dim(cumulative_probs))  # Should match the number of rows in linear_predictors

# Calculate category probabilities
# Ensure cumulative_probs is a matrix
cumulative_probs <- matrix(
  sapply(thresholds, function(threshold) logistic(threshold - linear_predictors)),
  ncol = length(thresholds),
  byrow = FALSE
)

# Check dimensions
print(dim(cumulative_probs))  # Should match the number of rows in linear_predictors

# Calculate category probabilities
# Ensure cumulative_probs is a matrix
cumulative_probs <- matrix(
  sapply(thresholds, function(threshold) logistic(threshold - linear_predictors)),
  ncol = length(thresholds),
  byrow = FALSE
)

# Check dimensions
print(dim(cumulative_probs))  # Should match the number of rows in linear_predictors

# Calculate category probabilities
category_probs <- cbind(
  cumulative_probs[, 1],                       # Probability of the first category
  cumulative_probs[, 2] - cumulative_probs[, 1], # Probability of the second category
  cumulative_probs[, 3] - cumulative_probs[, 2], # Probability of the third category
  cumulative_probs[, 4] - cumulative_probs[, 3], # Probability of the fourth category
  1 - cumulative_probs[, 4]                    # Probability of the last category
)

# Convert to a data frame and name columns
category_probs <- as.data.frame(category_probs)
colnames(category_probs) <- c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day")

# Check the result
print(head(category_probs))

# Combine probabilities with new_data
plot_data <- cbind(new_data, category_probs)

# Reshape into long format
library(tidyr)
plot_data_long <- pivot_longer(
  plot_data,
  cols = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"),
  names_to = "FEELBLU_Category",
  values_to = "Probability"
)

# Check the reshaped data
print(head(plot_data_long))

# Ensure SLEEPQL is an ordered factor with correct levels
plot_data_long$SLEEPQL <- factor(
  plot_data_long$SLEEPQL,
  levels = c("Very good", "Fairly good", "Fairly bad", "Very bad"),
  ordered = TRUE
)

library(viridis)



# MODEL without visit fixed effect

model_wv <- clmm(
  FEELBLU ~ SMOKERE + ALCO24H + ALCHL24R + DRNKBEE + SLEEP1M + RESTLES + SLEEPQL + 
    DEPRESS + NRVOUS + RESTLES + AVCIGDA_Group + AGE_Group + RACE + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,  # Increase max iterations
    gradTol = 1e-6  # Relax gradient tolerance
  )
)



# Model Summary
summary(model_wv)


# Perform Likelihood Ratio Test
anova(model, model_wv)

# Compare AIC for multiple models
AIC(model, model_wv)

# Compare BIC for multiple models
BIC(model, model_wv)






# Plot the effect of DEPRESS on FEELBLU
# Define logistic function
logistic <- function(x) {
  1 / (1 + exp(-x))
}

# Create a grid of predictor values
new_data <- expand.grid(
  SLEEPQL = c("Very good", "Fairly good", "Fairly bad", "Very bad"),  # SLEEPQL levels
  DEPRESS = mean(as.numeric(swan_lng_clean$DEPRESS)),  # Fixed at mean level
  SMOKERE = "No",
  ALCO24H = "No",
  ALCHL24R = "No",
  DRNKBEE = "No",
  SLEEP1M = "No",
  RESTLES = "Rarely/none of the time",
  AVCIGDA_Group = "None",
  AGE_Group = "1",
  RACE = "Caucasian/White Non-Hispanic"
)

# Extract thresholds
thresholds <- model$Theta

# Calculate linear predictors for each SLEEPQL level
linear_predictors <- new_data %>%
  mutate(
    SLEEPQL_numeric = case_when(
      SLEEPQL == "Very good" ~ 1,
      SLEEPQL == "Fairly good" ~ 2,
      SLEEPQL == "Fairly bad" ~ 3,
      SLEEPQL == "Very bad" ~ 4
    ),
    linear_predictor = SLEEPQL_numeric  # Adjust based on your model coefficients
  )

# Compute cumulative probabilities for each threshold
cumulative_probs <- sapply(thresholds, function(threshold) {
  logistic(threshold - linear_predictors$linear_predictor)
})

# Convert cumulative probabilities to specific category probabilities
category_probs <- cbind(
  "Not at all" = cumulative_probs[, 1],
  "1-5 days" = cumulative_probs[, 2] - cumulative_probs[, 1],
  "6-8 days" = cumulative_probs[, 3] - cumulative_probs[, 2],
  "9-13 days" = cumulative_probs[, 4] - cumulative_probs[, 3],
  "Every day" = 1 - cumulative_probs[, 4]
)

# Combine probabilities with data for visualization
plot_data <- cbind(new_data, category_probs)
plot_data_long <- reshape2::melt(
  plot_data,
  id.vars = c("SLEEPQL"),
  measure.vars = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"),
  variable.name = "FEELBLU",
  value.name = "Probability"
)

# Visualization
ggplot(plot_data_long, aes(x = SLEEPQL, y = Probability, fill = FEELBLU)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Changes in FEELBLU Categories by SLEEPQL Levels",
    x = "SLEEPQL Levels",
    y = "Predicted Probability",
    fill = "FEELBLU Categories"
  ) +
  theme_minimal()





# Model Diagnostics
# Extract fitted probabilities
fitted_probs <- fitted(model)
str(fitted_probs)

thresholds <- model$Theta
print(thresholds)

#Step 1: Map Probabilities
predicted_categories <- cut(
  fitted_probs,
  breaks = c(-Inf, thresholds, Inf),
  labels = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"),
  right = TRUE
)

#Step 2: Compare with Observed Categories
observed_categories <- as.character(swan_lng_clean$FEELBLU)

#Step 3: Create a Confusion Matrix
length(observed_categories)
length(predicted_categories)

# Rows included in the model
included_rows <- !is.na(fitted_probs)

# Adjust observed categories to match included rows
observed_categories <- as.character(swan_lng_clean$FEELBLU[included_rows])


length(included_rows) == nrow(swan_lng_clean)  # Should return TRUE

length(observed_categories)  # Should now match predicted_categories

observed_categories <- observed_categories[1:length(predicted_categories)]

confusion_matrix <- table(Observed = observed_categories, Predicted = predicted_categories)
print(confusion_matrix)


# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))

#Step 4: Visualize the Results: Visualize the observed vs. predicted categories 
#using a bar plot:

# Create a data frame for visualization
comparison_df <- data.frame(
  Observed = factor(observed_categories, levels = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day")),
  Predicted = factor(predicted_categories, levels = c("Not at all", "1-5 days", "6-8 days", "9-13 days", "Every day"))
)

# Plot observed vs predicted categories
ggplot(comparison_df, aes(x = Observed, fill = Predicted)) +
  geom_bar(position = "fill") +
  labs(title = "Observed vs Predicted Categories", x = "Observed Categories", y = "Proportion") +
  scale_fill_discrete(name = "Predicted Categories")



#Random Effects Check: Plot random effects (SWANID) to check for variability:
ranef_values <- ranef(model)$SWANID
hist(ranef_values, main = "Histogram of Random Effects (SWANID)", xlab = "Random Effects")




# Pseudo-R²
r2_primary <- r2_nakagawa(model)
print(r2_primary)

#Analysiis

#Data check and clean
sapply(swan_lng_clean[, c("ALCO24H", "ALCHL24R", "DRNKBEE", "GLASLIQ", "visit")], levels)

str(swan_lng_clean$visit)
# Explicitly convert GLASLIQ to a nominal (unordered) factor
swan_lng_clean$GLASLIQ <- factor(as.character(swan_lng_clean$GLASLIQ), levels = unique(as.character(swan_lng_clean$GLASLIQ)))

# Check the structure to confirm it's now a regular factor
str(swan_lng_clean$GLASLIQ)


swan_lng_clean$GLASLIQ <- factor(swan_lng_clean$GLASLIQ, levels = unique(swan_lng_clean$GLASLIQ))

sapply(swan_lng_clean[, c("ALCO24H", "ALCHL24R", "DRNKBEE", "GLASLIQ", "visit")], table)
sapply(swan_lng_clean[, c("FEELBLU", "ALCO24H", "ALCHL24R", "DRNKBEE", "GLASLIQ", "visit")], function(x) sum(is.na(x)))


# Alcohol Model
alcohol_model <- clmm(
  FEELBLU ~ ALCO24H + ALCHL24R + DRNKBEE * visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

alcohol_model <- clmm(
  FEELBLU ~ ALCO24H + ALCHL24R + DRNKBEE + (0 + visit | SWANID),
  data = swan_lng_clean,
  control = clmm.control(maxIter = 1e4, gradTol = 1e-6, useMatrix = TRUE)
)

# Summary
summary(alcohol_model)

# Diagnostics
fitted_probs <- predict(alcohol_model, type = "prob")

# Observed proportions by FEELBLU category
observed_probs <- prop.table(table(swan_lng_clean$FEELBLU), margin = 1)

# Combine observed and predicted for comparison
comparison <- data.frame(
  Category = levels(swan_lng_clean$FEELBLU),
  Predicted = colMeans(fitted_probs),
  Observed = observed_probs
)

print(comparison)


#Secondary Hypothesis 1: Smoking and Depression Symptoms

#Hypothesis:Women who smoke are more likely to report depression symptoms 
#compared to non-smokers over time.

# Smoking Model
smoking_model <- clmm(
  FEELBLU ~ SMOKERE + SMOKERE * visit + AVCIGDA * visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 2e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

summary(smoking_model)

smoking_model_M <- clmm(
  FEELBLU ~ SMOKERE + AVCIGDA + (0 + visit | SWANID),
  data = swan_lng_clean,
  control = clmm.control(maxIter = 1e4, gradTol = 1e-6, useMatrix = TRUE)
)



# Summary
summary(smoking_model_M)

# Adjusted Model with Simplified Random Effects and Alternative Optimizer
adjusted_smoking_model <- clmm(
  FEELBLU ~ SMOKERE * visit + AVCIGDA_Group * visit + (1 | SWANID),  # Simplified fixed effects structure
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 2e4,        # Increased the maximum number of iterations
    gradTol = 1e-6,       # Gradient tolerance (same as original)
    method = "nlminb"     # Alternative optimizer for better convergence
  )
)

summary(adjusted_smoking_model)

# Summary of the adjusted model
summary(adjusted_smoking_model)


# Summary
summary(smoking_model_1)

# optimizer "nlminb"
smoking_model_nlminb <- clmm(
  FEELBLU ~ SMOKERE + AVCIGDA + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 2e4,
    gradTol = 1e-6,
    useMatrix = TRUE,
    method = "nlminb"  # Default optimizer
  )
)
summary(smoking_model_nlminb)

# optimizer "ucminf"
smoking_model_ucminf <- clmm(
  FEELBLU ~ SMOKERE + AVCIGDA + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 2e4,
    gradTol = 1e-6,
    useMatrix = TRUE,
    method = "ucminf"  # Alternative optimizer
  )
)
summary(smoking_model_ucminf)

#SLEEP MODEL

sleep_model_refined <- clmm(
  FEELBLU ~ SLEEPQL * visit + RESTLES * visit + SLEEP1M * visit + visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 2e4,       # Increase maximum iterations
    gradTol = 1e-6,      # Keep gradient tolerance strict
    method = "nlminb"    # Use a more robust optimizer
  )
)

summary(sleep_model_refined)

# Sleep Model
sleep_model <- clmm(
  FEELBLU ~ SLEEPQL*visit + RESTLES * visit + SLEEP1M * visit + visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

# Summary
summary(sleep_model)

# Diagnostics
residuals_sleep <- simulateResiduals(sleep_model)
plot(residuals_sleep)
testUniformity(residuals_sleep)
testDispersion(residuals_sleep)
VarCorr(sleep_model)


#Secondary Hypothesis 4: Feeling Nervous and Depression Symptoms

# Nervous Model
nervous_model_time <- clmm(
  FEELBLU ~ NRVOUS + DEPRESS + visit + NRVOUS:visit + DEPRESS:visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 3e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)
# Summary
summary(nervous_model_time)




# Diagnostics
residuals_nervous <- simulateResiduals(nervous_model_time)
plot(residuals_nervous)
testUniformity(residuals_nervous)
testDispersion(residuals_nervous)
VarCorr(nervous_model_time)

#Alcohol model

alcohol_model <- clmm(
  FEELBLU ~ ALCO24H + ALCHL24R + DRNKBEE + (0 + visit | SWANID),
  data = swan_lng_clean,
  control = clmm.control(maxIter = 1e4, gradTol = 1e-6, useMatrix = TRUE)
)

summary(alcohol_model)



#Secondary Hypothesis 5: Changes in Lifestyle Behaviors and Depression Symptoms
#Hypothesis: Changes in lifestyle behaviors over time are associated with 
#changes in depression symptoms from visits 1 to 10.

# Time Change Model
time_change_model <- clmm(
  FEELBLU ~ SMOKERE * visit + ALCO24H + ALCHL24R + DRNKBEE + SLEEPQL * visit + RESTLES + SLEEP1M + 
    DEPRESS * visit + NRVOUS * visit + AVCIGDA + AGE + RACE * visit + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

# Summary
summary(time_change_model)


# Time Change Model
time_change_model_1 <- clmm(
  FEELBLU ~ ALCO24H + ALCHL24R + SMOKERE * visit + DRNKBEE * SMOKERE + SLEEPQL * SLEEP1M + RESTLES * visit + SLEEP1M + 
    DEPRESS * visit + NRVOUS * visit + AVCIGDA * NRVOUS + AGE + RACE * DEPRESS + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

# Summary
summary(time_change_model_1)
# Diagnostics
residuals_time_change <- simulateResiduals(time_change_model)
plot(residuals_time_change)
testUniformity(residuals_time_change)
testDispersion(residuals_time_change)
VarCorr(time_change_model)

# Time Change Model(THIS MODEL USED AS FINAL MODEL)
time_change_model_2 <- clmm(
  FEELBLU ~ ALCO24H + ALCHL24R + SMOKERE * visit + DRNKBEE * SMOKERE + SLEEPQL * SLEEP1M + RESTLES * visit + SLEEP1M * AGE + 
    DEPRESS * visit + NRVOUS * visit + AVCIGDA * NRVOUS + AGE * DEPRESS + RACE * DEPRESS + (1 | SWANID),
  data = swan_lng_clean,
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

# Summary
summary(time_change_model_2)


summary(swan_lng_clean)
summary(swan_lng_clean$SLEEPQL)
summary(swan_lng_clean$NRVOUS)
library(ordinal)

time_change_model_cloglog <- clmm(
  FEELBLU ~ ALCO24H + ALCHL24R + SMOKERE * visit + DRNKBEE * SMOKERE +
    SLEEPQL * SLEEP1M + RESTLES * visit + SLEEP1M * AGE +
    DEPRESS * visit + NRVOUS * visit + AVCIGDA * NRVOUS +
    RACE * DEPRESS + (1 | SWANID),
  data = swan_lng_clean,
  link = "cloglog", # Specify the complementary log-log link
  control = clmm.control(
    maxIter = 1e4,
    gradTol = 1e-6,
    useMatrix = TRUE
  )
)

# Summary
summary(time_change_model_cloglog)

#Comparison of two models using two different link function

# Extract log-likelihood, AIC, and BIC for both models
logit_stats <- c(logLik = logLik(time_change_model_2),
                 AIC = AIC(time_change_model_2),
                 BIC = BIC(time_change_model_2))

cloglog_stats <- c(logLik = logLik(time_change_model_cloglog),
                   AIC = AIC(time_change_model_cloglog),
                   BIC = BIC(time_change_model_cloglog))

# Combine statistics into a comparison table
model_comparison <- data.frame(
  Metric = c("Log-Likelihood", "AIC", "BIC"),
  Logit_Model = c(logit_stats["logLik"], logit_stats["AIC"], logit_stats["BIC"]),
  Cloglog_Model = c(cloglog_stats["logLik"], cloglog_stats["AIC"], cloglog_stats["BIC"])
)

print(model_comparison)


#Likelihood Ratio Test

# Likelihood ratio test
anova(time_change_model_2, time_change_model_cloglog)

#COMPARE Model Predictions
# Generate predicted probabilities for both models
logit_preds <- predict(time_change_model_2, type = "response")
cloglog_preds <- predict(time_change_model_cloglog, type = "response")

# Compare mean squared error (MSE) of predictions
logit_mse <- mean((logit_preds - swan_lng_clean$FEELBLU_numeric)^2, na.rm = TRUE)
cloglog_mse <- mean((cloglog_preds - swan_lng_clean$FEELBLU_numeric)^2, na.rm = TRUE)

mse_comparison <- data.frame(
  Model = c("Logit", "Cloglog"),
  MSE = c(logit_mse, cloglog_mse)
)

print(mse_comparison)



#Diagnosis of "time_change_model_2"
#Residual diagnostics

levels(swan_lng_clean$FEELBLU)
summary(swan_lng_clean$FEELBLU)

#Test for independent random effect


# Calculate category-specific probabilities from cumulative probabilities
n_categories <- 5 # Number of ordinal categories
predicted_probs_matrix <- matrix(NA, nrow = length(predicted_probs), ncol = n_categories)

for (k in 1:n_categories) {
  if (k == 1) {
    predicted_probs_matrix[, k] <- predicted_probs
  } else {
    predicted_probs_matrix[, k] <- predicted_probs - c(predicted_probs[-1], 0)
  }
}

# Normalize probabilities to ensure they sum to 1
predicted_probs_matrix <- sweep(predicted_probs_matrix, 1, rowSums(predicted_probs_matrix), FUN = "/")

#Step 2: Generate Residuals
generate_residuals_categorical <- function(observed, predicted_probs_matrix) {
  residuals <- numeric(length(observed))
  
  for (i in seq_along(observed)) {
    obs_category <- observed[i]
    lower_bound <- if (obs_category == 1) 0 else sum(predicted_probs_matrix[i, 1:(obs_category - 1)])
    upper_bound <- sum(predicted_probs_matrix[i, 1:obs_category])
    
    # Generate uniform residuals within the bounds
    residuals[i] <- runif(1, lower_bound, upper_bound)
  }
  
  qnorm(residuals) # Transform to normal scale
}

# Apply the function
residuals_conditional <- generate_residuals_categorical(observed_categories, predicted_probs_matrix)

# View the first few rows of the predicted probabilities matrix
head(predicted_probs_matrix)

# Check if there are any NA or negative values in the matrix
anyNA(predicted_probs_matrix)
any(predicted_probs_matrix < 0)

# Replace negative values with 0
predicted_probs_matrix[predicted_probs_matrix < 0] <- 0

# Normalize probabilities to ensure they sum to 1
predicted_probs_matrix <- sweep(predicted_probs_matrix, 1, rowSums(predicted_probs_matrix), FUN = "/")

# Check the updated probabilities
any(predicted_probs_matrix < 0) # Should be FALSE
rowSums(predicted_probs_matrix) # Should be close to 1 for each row

# Generate residuals function
generate_residuals_categorical <- function(observed, predicted_probs_matrix) {
  residuals <- numeric(length(observed))
  
  for (i in seq_along(observed)) {
    obs_category <- observed[i]
    # Calculate cumulative probabilities
    lower_bound <- if (obs_category == 1) 0 else sum(predicted_probs_matrix[i, 1:(obs_category - 1)])
    upper_bound <- sum(predicted_probs_matrix[i, 1:obs_category])
    
    # Generate uniform residuals within the bounds
    residuals[i] <- runif(1, lower_bound, upper_bound)
  }
  
  qnorm(residuals) # Transform to normal scale
}

# Apply the function to generate residuals
residuals_conditional <- generate_residuals_categorical(observed_categories, predicted_probs_matrix)

# Validate residuals
summary(residuals_conditional)
# Save the histogram as a PNG file
png("Histogram_of_Residuals.png", width = 800, height = 600)
hist(residuals_conditional, main = "Histogram of Residuals", xlab = "Residuals")
dev.off()

# Save the histogram
png("Histogram_of_Cleaned_Residuals.png", width = 800, height = 600)
hist(residuals_conditional_clean, main = "Histogram of Residuals", xlab = "Residuals", breaks = 30)
dev.off()


# Clean residuals by removing Inf and NaN
residuals_conditional_clean <- residuals_conditional[is.finite(residuals_conditional)]

# Validate cleaned residuals
summary(residuals_conditional_clean)


# Save the ACF plot
# Save the ACF plot
png("ACF_of_Cleaned_Residuals.png", width = 800, height = 600)
acf(residuals_conditional_clean, main = "ACF of Residuals", lag.max = 20)
dev.off()


# Save the PACF plot
png("PACF_of_Cleaned_Residuals.png", width = 800, height = 600)
pacf(residuals_conditional_clean, main = "PACF of Residuals", lag.max = 20)
dev.off()



# Example: Compute ACF for residuals grouped by time
library(dplyr)
grouped_residuals <- data.frame(time = visit, residuals = residuals_conditional) %>%
  group_by(time) %>%
  summarise(mean_residuals = mean(residuals, na.rm = TRUE))

# Convert aggregated residuals to time series
residuals_aggregated_ts <- ts(grouped_residuals$mean_residuals)

# Plot ACF and PACF for aggregated residuals
acf(residuals_aggregated_ts, main = "ACF of Aggregated Residuals", lag.max = 20)
pacf(residuals_aggregated_ts, main = "PACF of Aggregated Residuals", lag.max = 20)



# Extract fitted cumulative probabilities
fitted_probs_cum <- predict(time_change_model_2, type = "response")

# Extract observed categories as numeric levels
observed <- as.numeric(swan_lng_clean$FEELBLU)

# Compute generalized residuals
generalized_residuals <- qnorm(fitted_probs_cum[cbind(seq_along(observed), observed)])

# Ensure no infinite values (due to boundary predictions)
generalized_residuals[is.infinite(generalized_residuals)] <- NA

# Check structure
str(generalized_residuals)


# Extract fitted values as expected category means
fitted_values <- predict(time_change_model_2, type = "class")

length(fitted_values)
length(generalized_residuals)

# Ensure observed is the same length as fitted_values
observed <- as.numeric(swan_lng_clean$FEELBLU[1:13474])

# Extract cumulative probabilities for each observed category
generalized_residuals <- qnorm(fitted_probs_cum[cbind(seq_along(observed), observed)])

# Replace infinite values due to boundary predictions
generalized_residuals[is.infinite(generalized_residuals)] <- NA

length(fitted_values) == length(generalized_residuals)

str(fitted_probs_cum)
dim(fitted_probs_cum)  # Confirm if this is a matrix or vector

n_categories <- length(levels(swan_lng_clean$FEELBLU))

fitted_probs_matrix <- matrix(fitted_probs_cum, ncol = n_categories, byrow = TRUE)

fitted_probs_cum <- predict(time_change_model_2, type = "prob")

thresholds <- time_change_model_2$alpha  # Threshold coefficients
beta <- coef(time_change_model_2)  # Fixed effects
X <- model.matrix(time_change_model_2)  # Design matrix for predictors

str(X)

X <- model.matrix(terms(time_change_model_2), data = swan_lng_clean)

str(beta)

common_terms <- intersect(colnames(X), names(beta))
X <- X[, common_terms, drop = FALSE]
beta <- beta[common_terms]

linear_predictor <- X %*% beta

colnames(X)
names(beta)




# Compute the linear predictor
linear_predictor <- X %*% beta

# Extract cumulative probabilities from the model
fitted_probs_cum <- predict(time_change_model_2, type = "response")

# Determine the number of categories
n_categories <- length(levels(swan_lng_clean$FEELBLU))

# Reshape the cumulative probabilities into a matrix
fitted_probs_cum_matrix <- matrix(fitted_probs_cum, ncol = n_categories, byrow = TRUE)

# Calculate fitted values as the most likely category
fitted_values <- apply(fitted_probs_cum_matrix, 1, which.max)

# Inspect the structure and length of fitted_probs_cum
str(fitted_probs_cum)
length(fitted_probs_cum)

# Check the number of rows in the dataset
n_rows <- nrow(swan_lng_clean)

# Check the number of categories in the response variable
n_categories <- length(levels(swan_lng_clean$FEELBLU))

# Compute cumulative probabilities manually
cumulative_probs <- sapply(thresholds, function(thresh) {
  plogis(thresh - linear_predictor)
})

# Reshape into a matrix if necessary
fitted_probs_cum_matrix <- as.matrix(cumulative_probs)

#Check matrix dimension
dim(fitted_probs_cum_matrix)  # Should be (n_rows, n_categories)

# Include all thresholds to calculate probabilities for all categories
thresholds_extended <- c(thresholds, Inf)  # Add a boundary for the last category

# Compute cumulative probabilities for all categories
cumulative_probs <- sapply(thresholds_extended, function(thresh) {
  plogis(thresh - linear_predictor)
})

# Reshape into a matrix
fitted_probs_cum_matrix <- as.matrix(cumulative_probs)

dim(fitted_probs_cum_matrix)  # Should be (13474, 5)

# Initialize matrix for category-specific probabilities
category_probs <- matrix(NA, nrow = 13474, ncol = 5)

# Compute category-specific probabilities
category_probs[, 1] <- fitted_probs_cum_matrix[, 1]  # First category
for (i in 2:5) {
  category_probs[, i] <- fitted_probs_cum_matrix[, i] - fitted_probs_cum_matrix[, i - 1]
}

# Assign column names to match category labels
colnames(category_probs) <- levels(swan_lng_clean$FEELBLU)

# Verify dimensions
dim(category_probs)  # Should still be (13474, 5)


# Define observed categories as numeric
observed <- as.numeric(swan_lng_clean$FEELBLU)

# Compute generalized residuals
generalized_residuals <- qnorm(fitted_probs_cum_matrix[cbind(seq_len(13474), observed)])

# Handle infinite values due to boundary predictions
generalized_residuals[is.infinite(generalized_residuals)] <- NA

# Fitted values as the most likely category
fitted_values <- apply(category_probs, 1, which.max)

#RESIDUAL PLOT (SAVED as IMAGE in folder)

# Save plot as a PNG
png("Residuals_vs_Fitted_Values.png", width = 800, height = 600)
plot(fitted_values, generalized_residuals,
     main = "Residuals vs Fitted Values",
     xlab = "Fitted Values (Predicted Category)",
     ylab = "Generalized Residuals")
abline(h = 0, col = "red")
dev.off()

#Outlier investigation

# Compute generalized residuals for the model
generalized_residuals <- qnorm(fitted_probs_cum_matrix[cbind(seq_len(nrow(swan_lng_clean)), as.numeric(swan_lng_clean$FEELBLU))])

# Replace infinite values with NA
generalized_residuals[is.infinite(generalized_residuals)] <- NA

# Identify outliers: residuals greater than ±3 standard deviations
outlier_threshold <- 3
outliers <- which(abs(generalized_residuals) > outlier_threshold)

# Print the indices and residual values of outliers
cat("Outlier Indices:\n")
print(outliers)
cat("Outlier Residuals:\n")
print(generalized_residuals[outliers])

# Inspect the data points corresponding to these outliers
outlier_data <- swan_lng_clean[outliers, ]
print(outlier_data)


# Q-Q plot for generalized residuals
png("qqplot_generalized_residuals.png", width = 800, height = 600)
qqnorm(generalized_residuals, main = "Q-Q Plot of Generalized Residuals for time_change_model_2")
qqline(generalized_residuals, col = "red")
dev.off()

# Histogram of residuals
png("histogram_generalized_residuals.png", width = 800, height = 600)
hist(generalized_residuals, 
     main = "Histogram of Generalized Residuals for time_change_model_2",
     xlab = "Generalized Residuals", breaks = 30, col = "lightblue")
dev.off()

#LEVERAGE ANALYSIS

# Compute leverage values
# Compute (X^T X)^-1
XtX_inv <- solve(t(X) %*% X)

# Compute the hat matrix H
H <- X %*% XtX_inv %*% t(X)

# Extract leverage values (diagonal of H)
leverage <- diag(H)

# Inspect leverage values
summary(leverage)

mean_leverage <- mean(leverage)
high_leverage_threshold <- 2 * mean_leverage
high_leverage_points <- which(leverage > high_leverage_threshold)

# Print indices of high leverage points
print(high_leverage_points)

# Inspect these points in your dataset
high_leverage_data <- swan_lng_clean[high_leverage_points, ]
print(high_leverage_data)

png("leverage_plot.png", width = 800, height = 600)
plot(leverage, 
     main = "Leverage Values",
     xlab = "Observation Index",
     ylab = "Leverage",
     pch = 19)
dev.off()

# Apply influence diagnostics for the CLMM model
influential <- influence(time_change_model_2, group = "SWANID")

# Inspect the most influential observations
summary(influential)

# Plot influential measures
plot(influential)

# Plot Cook's distance
# Ensure factors have at least two levels
# Remove factors with fewer than two levels
clean_data <- swan_lng_clean[, sapply(swan_lng_clean, function(col) {
  if (is.factor(col)) length(unique(col)) > 1 else TRUE
})]


# Remove predictors with only one unique value
clean_data <- clean_data[, sapply(clean_data, function(col) {
  length(unique(col)) > 1
})]

#Convert the Response Variable to Numeric
clean_data$FEELBLU_numeric <- as.numeric(clean_data$FEELBLU)


# Check levels of factor variables
sapply(clean_data, function(col) {
  if (is.factor(col)) length(unique(col)) else NA
})
# Drop factors with fewer than two levels
clean_data <- clean_data[, sapply(clean_data, function(col) {
  if (is.factor(col)) length(unique(col)) > 1 else TRUE
})]

# Remove rows with NA values
clean_data <- clean_data[complete.cases(clean_data), ]
clean_data <- clean_data[, !names(clean_data) %in% "DRNKBEE"]

clean_data <- clean_data %>%
  mutate(
    SWANID = as.character(SWANID),  # If used as an ID, keep as character
    visit = as.numeric(visit),
    AGE = as.numeric(AGE)
  )

str(clean_data)

clean_data <- clean_data %>%
  mutate(across(where(is.factor), ~ droplevels(.)))

# Convert the response variable
clean_data$FEELBLU_numeric <- as.numeric(clean_data$FEELBLU)

lm_model <- lm(FEELBLU_numeric ~ ., data = clean_data)


# Fit the model
lm_model <- lm(FEELBLU_numeric ~ ., data = clean_data)

# Cook's Distance
cooks_distance <- cooks.distance(lm_model)

# Plot Cook's Distance
# Save Cook's Distance plot
png("Cooks_Distance.png", width = 800, height = 600)
plot(cooks_distance,
     type = "h",
     main = "Cook's Distance",
     xlab = "Observation Index",
     ylab = "Cook's Distance")
abline(h = 4 / nrow(clean_data), col = "red", lty = 2)
dev.off()  # Close the device

# Save Q-Q plot
png("QQ_Plot.png", width = 800, height = 600)
qqnorm(residuals(lm_model), main = "Q-Q Plot of Residuals")
qqline(residuals(lm_model), col = "red")
dev.off()  # Close the device

str(swan_lng_clean)
summary(swan_lng_clean)


####
# Inspect the output of predict()
str(fitted_probs_cum)

# Define the number of categories in the outcome variable
n_categories <- length(attr(time_change_model_2$y, "lev"))

length(fitted_probs_cum) == length(time_change_model_2$y) * n_categories

length(fitted_probs_cum)
head(fitted_probs_cum)

n_categories <- length(attr(time_change_model_2$y, "lev"))
attr(time_change_model_2$y, "lev")

length(time_change_model_2$y)


str(swan_lng_clean$FEELBLU)

levels(swan_lng_clean$FEELBLU)

time_change_model_2$alpha

# Extract model coefficients
beta <- coef(time_change_model_2)  # Fixed effects
thresholds <- time_change_model_2$alpha  # Threshold coefficients

# Recreate the design matrix manually
X <- model.matrix(~ SMOKERE * visit + ALCO24H + ALCHL24R + DRNKBEE +
                    SLEEPQL * visit + RESTLES + SLEEP1M + 
                    DEPRESS * visit + NRVOUS * visit + AVCIGDA +
                    AGE + RACE * visit, 
                  data = swan_lng_clean)



# Extract fixed effects coefficients
beta <- coef(time_change_model_2)

# Match column names
matched_columns <- intersect(names(beta), colnames(X))
X <- X[, matched_columns, drop = FALSE]
beta <- beta[matched_columns]


# Compute linear predictor
linear_predictor <- X %*% beta

# Compute cumulative probabilities
cumulative_probs <- sapply(thresholds, function(thresh) {
  plogis(thresh - linear_predictor)
})

# Convert cumulative probabilities to category-specific probabilities
category_probs <- matrix(NA, nrow = nrow(cumulative_probs), ncol = ncol(cumulative_probs) + 1)
category_probs[, 1] <- cumulative_probs[, 1]
for (i in 2:ncol(cumulative_probs)) {
  category_probs[, i] <- cumulative_probs[, i] - cumulative_probs[, i - 1]
}
category_probs[, ncol(category_probs)] <- 1 - cumulative_probs[, ncol(cumulative_probs)]

# Check the dimensions of category_probs
dim(category_probs)

# Check the levels of the outcome variable
levels(swan_lng_clean$FEELBLU)

# Assign column names to match the levels of FEELBLU
colnames(category_probs) <- levels(swan_lng_clean$FEELBLU)

# Inspect the updated matrix
head(category_probs)

# Confirm column names
colnames(category_probs)

pdf("category_probabilities_histograms.pdf", width = 10, height = 7)  # Adjust dimensions as needed
par(mfrow = c(2, 3))  # 2 rows and 3 columns
for (cat in colnames(category_probs)) {
  hist(category_probs[, cat], 
       main = paste("Predicted Probabilities for", cat), 
       xlab = "Probability", 
       breaks = 30)
}
dev.off()  # Close the PDF device
